global username = c(username)
global dropbox "/Users/${username}/Dropbox"
global master_file_path "${dropbox}/Engel_GFT/replication_files"

global input "$master_file_path\data/intermediate_data"
global output "$master_file_path\data/intermediate_data/conventional_price_indices"

cd "$master_file_path\data\intermediate_data"
set more off


**************************************************************
****      Part 1: Consumption within each round        *******
**************************************************************

if 1==1 {
	
  *(1) round 43
   set more off

	use "R43R55/expenditures_item_level_R43R55_i_groupsV1.dta" if round==43, clear
	
    *drop households with flagged discrepancies between sum of exp. and mpce
    drop if flag_mpce==1

    *drop goods with no quantity data
    drop if quantity==.
    drop if quantity==0

    *Keep only goods in our 34 i groupings:
    drop if flag_deaton_drops==1

	replace concorded_itemlabel2=trim(concorded_itemlabel2)

	*Get rid of products beyond fuel and light:
	drop if round==43 & itemcode43>=480
	drop if round==55 & Srl_no_of_item>=360

    *calculate expenditures and quantities at the hhid X item level:
    collapse (sum) expenditure quantity, by(state43 district43 sector wt hhsize mpce hhid concorded_itemlabel2 i_groupsV1 g_groupsV3)

    gen price=expenditure/quantity
    replace price=round(price, 0.01)

    *Angus' automatic test for unit price outliers:
    gen log_price=log(price)
    egen sd_log_price=sd(log_price), by(concorded_itemlabel2)
    egen mean_log_price=mean(log_price), by(concorded_itemlabel2)
    gen diff=abs(log_price-mean_log_price)
    gen compare=0
    replace compare=1 if diff>2*sd_log_price
    drop if compare==1
    drop compare diff mean_log_price sd_log_price log_price

    bysort concorded_itemlabel2 state43 sector: egen price_state=median(price)
    replace price_state=round(price_state, 0.01)

    bysort concorded_itemlabel2 state43 district43 sector: egen price_district=median(price)
    replace price_district=round(price_district, 0.01)
	
    egen item_seq = group(concorded_itemlabel2), label
    egen i_groupsV1_seq=group(i_groupsV1), label

    preserve
    
	duplicates drop item_seq concorded_itemlabel2 i_groupsV1 g_groupsV3, force
    keep concorded_itemlabel2 item_seq i_groupsV1 i_groupsV1_seq
    *save "${output}\item_seq_igroupsV1.dta", replace
	save "R43R55\item_seq_igroupsV1.dta", replace

    *Household level again here:
    restore


    keep expenditure quantity price_state price_district state43 sector district43 hhid wt hhsize mpce item_seq i_groupsV1_seq

    egen sum_exp=total(expenditure), by(state43 sector district43 hhid)

    *Household level dataset with expenditures and prices at the item level:

    gen igroupsV1_itemseq=i_groupsV1_seq*10000+item_seq

        reshape wide expenditure quantity price_state price_district , i(state43 district43 sector wt hhsize mpce hhid sum_exp i_groupsV1 item_seq) j(igroupsV1_itemseq)

		
    *Fill in with 0 in case of no purchases:
         foreach var of varlist expenditure* quantity* {
	     replace `var'=0 if `var'==.
		 }

    *create 9 deciles of income for each districtXsector and for each stateXsector (for democratic CPI) (centered at 6-15, 16-25, etc.):
    egen decile_st_bounds= xtile(sum_exp), by(state43 sector) nq(20)
    egen decile_st= xtile(sum_exp) if decile_st_bounds!=1 & decile_st_bounds!=20, by(state43 sector) nq(9)
    drop decile_st_bounds

    egen decile_dist_bounds= xtile(sum_exp), by(state43 sector district43) nq(20)
    egen decile_dist= xtile(sum_exp) if decile_dist_bounds!=1 & decile_dist_bounds!=20, by(state43 sector district43) nq(9)
    drop decile_dist_bounds

   *Create district by decile budget shares (plutocratic):
        *Total expenditure by decile district:
         bysort state43 sector district43 decile_dist: egen tot_exp_dist=total(sum_exp*wt)

         *Expenditure on each good by decile district and sector:
         foreach var of varlist expenditure* {
         loc a=substr("`var'",12,.)
         egen exp_`a'=total(`var'*wt) ,  by(state43 sector district43 decile_dist)
         gen pluto_share_`a'=exp_`a'/tot_exp_dist
		 }
         drop exp_* tot_exp_dist


    *Create district by decile budget shares (democratic):
         *Average hhd share on each good by district:
         foreach var of varlist expenditure* {
         loc a=substr("`var'",12,.)
         gen aux_`a'=(`var'/sum_exp)
         egen demo_share_`a'=wtmean(aux_`a') , weight(wt)  by(state43 sector district43 decile_dist)
		 }

 
    *fill district level prices:
     foreach var of varlist price_district* {
     loc a=substr("`var'",15,.)
	*can use the max since they are all the same:
    bysort state43 sector district43: egen price_dist_`a'=max(`var')
	 cap drop `var'
	 }


    *fill state level prices:
  foreach var of varlist price_state* {
     loc a=substr("`var'",12,.)
	*can use the max since they are all the same:
    bysort state43 sector : egen price_st_`a'=max(`var')
	 cap drop `var'
	 }


    egen dist_wt=total(wt), by(state43 sector district43)

    duplicates drop state43 sector district43 decile_dist, force

	drop if decile_dist==.

   keep state43 sector district43 decile_dist price_dist_* pluto_share_* demo_share_* price_st_* dist_wt

   save "$output/R43_district_decile_level_V1_D", replace

*end of if 1==1:
}



if 1==1 {
	
  *(1) round 55
   set more off

	use "R43R55/expenditures_item_level_R43R55_i_groupsV1.dta" if round==55, clear
	
    *drop households with flagged discrepancies between sum of exp. and mpce
    drop if flag_mpce==1

    *drop goods with no quantity data
    drop if quantity==.
    drop if quantity==0

    *Keep only goods in our 34 i groupings:
    drop if flag_deaton_drops==1

	replace concorded_itemlabel2=trim(concorded_itemlabel2)

	*Make list of goods consistent over rounds:
	*note: other cereals only drops in R55:
    drop if concorded_itemlabel2=="Other cereals"

	*Get rid of products beyond fuel and light:
	drop if round==43 & itemcode43>=480
	drop if round==55 & Srl_no_of_item>=360

	replace quantity=quantity/100
	replace quantity=round(quantity, 0.01)

    *calculate expenditures and quantities at the hhid X item level:
    collapse (sum) expenditure quantity, by(state43 district43 sector wt hhsize mpce hhid concorded_itemlabel2 i_groupsV1 g_groupsV3)

    gen price=expenditure/quantity
    replace price=round(price, 0.01)

    *Angus' automatic test for unit price outliers:
    gen log_price=log(price)
    egen sd_log_price=sd(log_price), by(concorded_itemlabel2)
    egen mean_log_price=mean(log_price), by(concorded_itemlabel2)
    gen diff=abs(log_price-mean_log_price)
    gen compare=0
    replace compare=1 if diff>2*sd_log_price
    drop if compare==1
    drop compare diff mean_log_price sd_log_price log_price

    bysort concorded_itemlabel2 state43 sector: egen price_state=median(price)
    replace price_state=round(price_state, 0.01)

    bysort concorded_itemlabel2 state43 district43 sector: egen price_district=median(price)
    replace price_district=round(price_district, 0.01)
	
    egen item_seq = group(concorded_itemlabel2), label
    egen i_groupsV1_seq=group(i_groupsV1), label

    keep expenditure quantity price_state price_district state43 sector district43 hhid wt hhsize mpce item_seq i_groupsV1_seq

    egen sum_exp=total(expenditure), by(state43 sector district43 hhid)

    *Household level dataset with expenditures and prices at the item level:

    gen igroupsV1_itemseq=i_groupsV1_seq*10000+item_seq

        reshape wide expenditure quantity price_state price_district , i(state43 district43 sector wt hhsize mpce hhid sum_exp i_groupsV1 item_seq) j(igroupsV1_itemseq)

    *Fill in with 0 in case of no purchases:
         foreach var of varlist expenditure* quantity* {
	     replace `var'=0 if `var'==.
		 }

* not needed for district level prices:
    *create 9 deciles of income for each districtXsector and for each stateXsector (for democratic CPI) (centered at 6-15, 16-25, etc.):
    egen decile_st_bounds= xtile(sum_exp), by(state43 sector) nq(20)
    egen decile_st= xtile(sum_exp) if decile_st_bounds!=1 & decile_st_bounds!=20, by(state43 sector) nq(9)
    drop decile_st_bounds

    egen decile_dist_bounds= xtile(sum_exp), by(state43 sector district43) nq(20)
    egen decile_dist= xtile(sum_exp) if decile_dist_bounds!=1 & decile_dist_bounds!=20, by(state43 sector district43) nq(9)
    drop decile_dist_bounds

   *Create district by decile budget shares (plutocratic):
        *Total expenditure by decile district:
         bysort state43 sector district43 decile_dist: egen tot_exp_dist=total(sum_exp*wt)

         *Expenditure on each good by decile district and sector:
         foreach var of varlist expenditure* {
         loc a=substr("`var'",12,.)
         egen exp_`a'=total(`var'*wt) ,  by(state43 sector district43 decile_dist)
         gen pluto_share_`a'=exp_`a'/tot_exp_dist
		 }
         drop exp_* tot_exp_dist


    *Create district by decile budget shares (democratic):
         *Average hhd share on each good by district:
         foreach var of varlist expenditure* {
         loc a=substr("`var'",12,.)
         gen aux_`a'=(`var'/sum_exp)
         egen demo_share_`a'=wtmean(aux_`a') , weight(wt)  by(state43 sector district43 decile_dist)
		 }


    *fill district level prices:
  foreach var of varlist price_district* {
     loc a=substr("`var'",15,.)
	*can use the max since they are all the same:
    bysort state43 sector district43: egen price_dist_`a'=max(`var')
	 cap drop `var'
	 }


    *fill state level prices:
  foreach var of varlist price_state* {
     loc a=substr("`var'",12,.)
	*can use the max since they are all the same:
    bysort state43 sector : egen price_st_`a'=max(`var')
	 cap drop `var'
	 }


    egen dist_wt=total(wt), by(state43 sector district43)

    duplicates drop state43 sector district43 decile_dist, force


   keep state43 sector district43  decile_dist  price_dist_* pluto_share_* demo_share_* price_st_* dist_wt

   drop if decile_dist==.
   
   save "$output/R55_district_decile_level_V1_D", replace

 
   
*end of 1==1:
}








**************************************************************
****        Part 2: Construct CPI: across rounds       *******
****
*State level (Paasche and Laspeyres, plutocratic and democratic)
*District level (Paasche and Laspeyres, plutocratic and democratic)
**************************************************************


**************************************************************
*(1) District level (Paasche and Laspeyres, plutocratic and democratic), R43, R55 for 34 i items
**************************************************************

if 1==1{
local round1 "43"
local round2 "55"
forvalues i=1/1 {
local initial: word `i' of `round1'
local final: word `i' of `round2'

   set more off
	 use  "$output/R`final'_district_decile_level_V1_D", clear
	
*Recall we are using median prices at district or state level.
*Replace median P with log(median P) so we can calculate changes:

    foreach var of varlist price_dist_*  {
    local i=substr("`var'",12,6)	
    qui gen  ln_`var' =log(price_dist_`i')
    drop `var'
    }

    foreach var of varlist  price_st_* {
    local i=substr("`var'",10,6)	
    qui gen ln_`var'=log(price_st_`i')
    drop `var'
    }

    order sector state43 district43 dist_wt
    *rename with round first:
	foreach var of varlist pluto_share_10005-ln_price_st_340116 {
	    rename `var' _`final'_`var'
		}

	merge 1:1 state43 sector district43 decile_dist using "$output/R`initial'_district_decile_level_V1_D"
    keep if _merge==3
	drop _merge

    foreach var of varlist price_dist_*  {
    local i=substr("`var'",12,6)	
    qui gen  ln_`var' =log(price_dist_`i')
    drop `var'
    }

    foreach var of varlist  price_st_* {
    local i=substr("`var'",10,6)	
    qui gen ln_`var'=log(price_st_`i')
    drop `var'
    }

    order sector state43 district43 decile_dist dist_wt
    *rename with round first:
	foreach var of varlist pluto_share_10005-ln_price_st_340116 {
	    rename `var' _`initial'_`var'
		}

*Calculate log price differences:

	 foreach var of varlist _`initial'_ln_price_dist_* {
	   loc a=substr("`var'",19,.)
	    qui gen d_ln_p_dist_`final'_`initial'_`a'= _`final'_ln_price_dist_`a'-_`initial'_ln_price_dist_`a'

        *replace with state price change if missing at district level:
        qui replace d_ln_p_dist_`final'_`initial'_`a'=_`final'_ln_price_st_`a'-_`initial'_ln_price_st_`a' if d_ln_p_dist_`final'_`initial'_`a'==.

        *replace with national price change if missing at state level:
        qui egen national_final=median(_`final'_ln_price_st_`a')
        qui egen national_initial=median(_`initial'_ln_price_st_`a')
        qui replace d_ln_p_dist_`final'_`initial'_`a'=national_final-national_initial if d_ln_p_dist_`final'_`initial'_`a'==.
        drop national_initial national_final _`final'_ln_price_dist_`a' _`initial'_ln_price_dist_`a' _`final'_ln_price_st_`a' _`initial'_ln_price_st_`a'
	   }

  

*Calculate weighted average change in prices for the 34 i's:
*This is done in two cases separately, for item i from 1-9 and item i- 10-34 (2 digits):

    tempfile dprices
    save `dprices'

    foreach var of varlist  _43_demo_share_* {
    use `dprices',clear
    if regexm("`var'","_[0-9][0-9][0-9][0-9][0-9]$")==1 {
    local i=substr("`var'",16,1)
    local item=substr("`var'",18,3)

    *This drop variables that have 6 digits than `i'0`item':
        foreach var2 of varlist  _43_demo_share_* {
        if regexm("`var2'","_[0-9][0-9][0-9][0-9][0-9]$")==0 {
        local d=substr("`var2'",16,.)
        drop _*_*_share_`d' d_ln_p_dist_*_*_`d'
        }
        }

    *Keep only items in `i' (should only have variables with 5 digits here):
    keep sector-dist_wt  _*_*_share_`i'0*    d_ln_p_dist_*_*_`i'0*
    gen good_i=`i'
    order good_i, after(dist_wt)

    *gen sum of budget shares (these are only the items that are components of good i):
    egen sum_demo_shares_`i'=rowtotal(_43_demo_share_*), mis
    egen sum_pluto_shares_`i'=rowtotal(_43_pluto_share_*), mis

	*Calculate weighted averages (using initial shares): 
    foreach var3 of varlist d_ln_p_dist_`final'_`initial'_`i'0* {
	   loc a=substr("`var3'",21,.)
	   gen aux_d_ln_p_dist_`final'_`initial'_`i'0`a'=`var3'*(_`initial'_pluto_share_`i'0`a')
       gen aux_d_ln_d_dist_`final'_`initial'_`i'0`a'=`var3'*(_`initial'_demo_share_`i'0`a')
	   }
    egen sum_d_ln_p_dist_`final'_`initial'_`i'=rowtotal(aux_d_ln_p_dist_`final'_`initial'_`i'0*), mis
    egen sum_d_ln_d_dist_`final'_`initial'_`i'=rowtotal(aux_d_ln_d_dist_`final'_`initial'_`i'0*), mis

    gen d_ln_p_pluto_`final'_`initial'_`i'=sum_d_ln_p_dist_`final'_`initial'_`i'/sum_pluto_shares_`i'
    gen d_ln_p_demo_`final'_`initial'_`i'=sum_d_ln_d_dist_`final'_`initial'_`i'/sum_demo_shares_`i'
}


    *The case in which the variable name refers to item i from 10-34:
    if regexm("`var'","_[0-9][0-9][0-9][0-9][0-9][0-9]$")==1 {
    local i=substr("`var'",16,2)
    local item=substr("`var'",19,3)

    *This drop variables that have 5 digits than `i'0`item':
        foreach var2 of varlist  _43_demo_share_* {
        if regexm("`var2'","[0-9][0-9][0-9][0-9][0-9][0-9]$")==0 {
        local d=substr("`var2'",16,.)
        drop _*_*_share_`d' d_ln_p_dist_*_*_`d'
        }
        }

    *Keep only items in `i' (should only have variables with 6 digits here):
    keep sector-dist_wt  _*_*_share_`i'0*    d_ln_p_dist_*_*_`i'0*
    gen good_i=`i'
    order good_i, after(dist_wt)

    *gen sum of budget shares (these are only the items that are components of good i):
    egen sum_demo_shares_`i'=rowtotal(_43_demo_share_*), mis
    egen sum_pluto_shares_`i'=rowtotal(_43_pluto_share_*), mis

	*Calculate weighted averages:
    foreach var3 of varlist d_ln_p_dist_`final'_`initial'_`i'0* {
	   loc a=substr("`var3'",22,.)
	   gen aux_d_ln_p_dist_`final'_`initial'_`i'0`a'=`var3'*(_`initial'_pluto_share_`i'0`a')
        gen aux_d_ln_d_dist_`final'_`initial'_`i'0`a'=`var3'*(_`initial'_demo_share_`i'0`a')
	   }
    egen sum_d_ln_p_dist_`final'_`initial'_`i'=rowtotal(aux_d_ln_p_dist_`final'_`initial'_`i'0*), mis
    egen sum_d_ln_d_dist_`final'_`initial'_`i'=rowtotal(aux_d_ln_d_dist_`final'_`initial'_`i'0*), mis

    gen d_ln_p_pluto_`final'_`initial'_`i'=sum_d_ln_p_dist_`final'_`initial'_`i'/sum_pluto_shares_`i'
    gen d_ln_p_demo_`final'_`initial'_`i'=sum_d_ln_d_dist_`final'_`initial'_`i'/sum_demo_shares_`i'
}
    keep state43 sector district43 decile_dist dist_wt good_i d_ln_p_demo_`final'_`initial'_`i' d_ln_p_pluto_`final'_`initial'_`i'  sum_demo_shares_`i'  sum_pluto_shares_`i'
    tempfile d_ln_p_`final'_`initial'_item_`i'
    qui save `d_ln_p_`final'_`initial'_item_`i'', replace
    display "item `i' done"
}

*end round1 round2:
}


*Assemble all `i' datasets:

use `d_ln_p_55_43_item_1',clear
forvalues j=2/34{
append using `d_ln_p_55_43_item_`j''
}
label var good_i "i goods V3 1-34"
label var sum_demo_shares_1 "sum of hhd demo budget shares of items in i"
label var sum_pluto_shares_1 "sum of hhd pluto budget shares of items in i"
label var d_ln_p_pluto_55_43_1 "change in log price of good i (weighted by initial shares)"
label var d_ln_p_demo_55_43_1 "change in log price of good i (weighted by initial shares)"

*Some i-level goods are blank because there is no consumption in the district of any of the underlying individual items. Replace these cases with the average log price change in the state or country.
forvalues k=1/34 {
bysort state43 sector: egen state_d_ln_p_pluto_55_43_`k'=mean(d_ln_p_pluto_55_43_`k')
bysort sector: egen nat_d_ln_p_pluto_55_43_`k'=mean(d_ln_p_pluto_55_43_`k')

bysort state43 sector: egen state_d_ln_p_demo_55_43_`k'=mean(d_ln_p_demo_55_43_`k')
bysort sector: egen nat_d_ln_p_demo_55_43_`k'=mean(d_ln_p_demo_55_43_`k')

replace d_ln_p_pluto_55_43_`k'= state_d_ln_p_pluto_55_43_`k' if sum_demo_shares_`k'==0 & d_ln_p_pluto_55_43_`k'==.
replace d_ln_p_pluto_55_43_`k'= nat_d_ln_p_pluto_55_43_`k' if sum_demo_shares_`k'==0 & d_ln_p_pluto_55_43_`k'==.

replace d_ln_p_demo_55_43_`k'= state_d_ln_p_demo_55_43_`k' if sum_demo_shares_`k'==0 & d_ln_p_demo_55_43_`k'==.
replace d_ln_p_demo_55_43_`k'= nat_d_ln_p_demo_55_43_`k' if sum_demo_shares_`k'==0 & d_ln_p_demo_55_43_`k'==.

drop nat_d_ln_p_pluto_55_43_`k' state_d_ln_p_pluto_55_43_`k' nat_d_ln_p_demo_55_43_`k' state_d_ln_p_demo_55_43_`k'
}


save "$output/d_ln_p_goods_i_55_43_decile.dta", replace

*end if 1==2:
}



